In this practical, we are first going to refresh our knowledge of (or get acquainted with) Python in Google Colab, and then we will continue with text mining and regular expressions! Are you looking for Python documentation to refresh your knowledge of the language? If so, you can check https://docs.python.org/3/reference/
Google Colab¶
Google Colaboratory, or "Colab" for short, allows you to write and execute Python in your browser, with:
- Zero configuration required
- Free access to GPUs and more
- Easy sharing
Colab notebooks are Jupyter notebooks that are hosted by Colab. You can find a more detailed introduction to Colab here, but we will also cover the basics below.
Simple text processing¶
1. Open Colab and create a new empty notebook to work with Python 3!
Go to https://colab.research.google.com/ and login with your account. Then click on "File $\rightarrow$ New notebook".
If you want to insert a new code chunk below the cell you are currently in, press Alt + Enter (Option + Enter on Mac).
If you want to stop your code from running in Colab:
- Interrupt execution by pressing Ctrl + M I, or simply click the stop button.
- Or: press Ctrl + A to select all the code of that particular cell, then Ctrl + X to cut it. Now the cell is empty and can be deleted with Ctrl + M D or the delete button. You can then paste your code into a new code chunk and adjust it.
NB: On Mac, use Cmd instead of Ctrl in these shortcuts.
2. Text is also known as a string variable, or as an array of characters. Create a variable a with the text value "Hello @Text Mining World! I'm here to learn, right?", and then print it!
a = "Hello @Text Mining World! I'm here to learn, right?"
a
"Hello @Text Mining World! I'm here to learn, right?"
3. Print the first and last character of your variable.
print(a[0])   # without print(), only the last expression in the cell is displayed
print(a[31])  # note: hard-coding an index like this does not give the last character
l = len(a)
print("Length of your string is: ", l)
print(a[l-1]) # the last character is at index len(a) - 1
H
e
Length of your string is:  51
?
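Python also supports negative indices, which count from the end of the string; a[-1] is a handy alternative to a[len(a)-1]. A small sketch:

```python
a = "Hello @Text Mining World! I'm here to learn, right?"

# Negative indices count from the end: -1 is the last character
last_char = a[-1]   # same as a[len(a) - 1]
last_word = a[-6:]  # slicing works too: the last six characters
print(last_char, last_word)
```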
4. Use the !pip install command to install the packages numpy, nltk, gensim, and spacy.
NB: The re package (for regular expressions) is part of Python's standard library and comes pre-installed. You don't need to run !pip install re; you can simply import it and use it directly in your code.
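For instance, importing re and running a quick pattern over our example string works straight away (the @-mention pattern below is just an illustration):

```python
import re

a = "Hello @Text Mining World! I'm here to learn, right?"

# '@' followed by one or more word characters
mentions = re.findall(r'@\w+', a)
print(mentions)  # ['@Text']
```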
NB: Generally, you only need to install a package once on your computer and then just import it when needed. In Colab, however, you may need to reinstall packages after your runtime is reset or you reconnect.
!pip install -q numpy
!pip install -q nltk
!pip install -q gensim
!pip install -q spacy
5. Import (load) the nltk package, then use the string method lower() to convert the characters in string a to their lowercase form and save the result in a new variable b.
import nltk
b = a.lower()
b
"hello @text mining world! i'm here to learn, right?"
NB: nltk comes with many corpora, toy grammars, trained models, etc. A complete list is posted at https://www.nltk.org/nltk_data/. To install the data, after installing nltk, you can use the nltk.download() data downloader. We will make use of this in Question 8.
6. Use the string package to print the list of punctuation characters.
Punctuation marks can separate characters, words, phrases, or sentences. In some applications they are very important to the task at hand; in others they are redundant and should be removed! We will learn more about this in text pre-processing.
import string
print(string.punctuation)
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
7. Use the punctuation list to remove the punctuation from the lowercase form of our example string a. Name your variable c.
# Remember there are many ways to remove punctuations! This is only one of them:
c = "".join([char for char in b if char not in string.punctuation])
print(c)
hello text mining world im here to learn right
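An equivalent and often faster idiom uses str.translate with a table built by str.maketrans; a sketch reproducing the result above:

```python
import string

b = "hello @text mining world! i'm here to learn, right?"

# Map every punctuation character to None, dropping them in one pass
c = b.translate(str.maketrans("", "", string.punctuation))
print(c)  # hello text mining world im here to learn right
```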
8. Use the function RegexpTokenizer() from nltk to tokenize the string b whilst removing punctuation (tokenization is the process of splitting text into smaller units, such as words, sentences, or subwords; we'll talk more about this next week).
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize(b)
['hello', 'text', 'mining', 'world', 'i', 'm', 'here', 'to', 'learn', 'right']
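Note that \w+ splits "i'm" into "i" and "m". RegexpTokenizer is essentially a wrapper around re.findall, so you can prototype richer patterns with the standard library first, e.g. keeping contractions intact (the pattern below is illustrative):

```python
import re

b = "hello @text mining world! i'm here to learn, right?"

# Try an apostrophe-joined pair first (i'm), otherwise a plain word
tokens = re.findall(r"\w+'\w+|\w+", b)
print(tokens)
```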
Working with text datasets¶
Working with a text dataset is similar to simple text processing. Many websites offer publicly available text datasets you can practice on.
We want to analyze the Taylor Swift song lyrics data from all her albums. Download the dataset from the course website or alternatively from Kaggle.
Upload taylor_swift_lyrics.csv
to Google Colab. You can do this by clicking on the Files button on the very left side of Colab and drag and drop the data there or click the upload button. Alternatively you can mount Google Drive and upload the dataset there.
Taylor Swift Lyrics dataset¶
9. Read the taylor_swift_lyrics.csv dataset. Check the dataframe using the head() and tail() functions and the iloc attribute.
import pandas as pd
ts_lyrics = pd.read_csv("data/taylor_swift_lyrics.csv")
ts_lyrics.head()
  | Artist | Album | Title | Lyrics
---|---|---|---|---
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... |
1 | Taylor Swift | Taylor Swift | Picture to Burn | State the obvious, I didn't get my perfect fan... |
2 | Taylor Swift | Taylor Swift | Teardrops on my Guitar | Drew looks at me,\nI fake a smile so he won't ... |
3 | Taylor Swift | Taylor Swift | A Place in This World | I don't know what I want, so don't ask me\n'Ca... |
4 | Taylor Swift | Taylor Swift | Cold As You | You have a way of coming easily to me\nAnd whe... |
ts_lyrics.tail()
  | Artist | Album | Title | Lyrics
---|---|---|---|---
127 | Taylor Swift | folklore | mad woman | What did you think I'd say to that?\nDoes a sc... |
128 | Taylor Swift | folklore | epiphany | Keep your helmet\nKeep your life, son\nJust a ... |
129 | Taylor Swift | folklore | betty | Betty, I won't make assumptions about why you ... |
130 | Taylor Swift | folklore | peace | Our coming of age has come and gone\nSuddenly ... |
131 | Taylor Swift | folklore | hoax | My only one\nMy smoking gun\nMy eclipsed sun\n... |
ts_lyrics.iloc[0]
Artist                                         Taylor Swift
Album                                          Taylor Swift
Title                                            Tim McGraw
Lyrics    He said the way my blue eyes shinx\nPut those ...
Name: 0, dtype: object
ts_lyrics.head(1)
  | Artist | Album | Title | Lyrics
---|---|---|---|---
0 | Taylor Swift | Taylor Swift | Tim McGraw | He said the way my blue eyes shinx\nPut those ... |
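As a quick reminder of the difference between positional and label-based indexing (a toy frame for illustration, not the lyrics data):

```python
import pandas as pd

# Toy frame with non-default index labels
df = pd.DataFrame({"Title": ["Tim McGraw", "betty"]}, index=[10, 20])

by_position = df.iloc[0]["Title"]  # iloc is purely positional
by_label = df.loc[20, "Title"]     # loc uses the index labels
print(by_position, by_label)
```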
10. Use the str.contains function and write a regex that finds rows where the lyrics contain a specific word, such as "love".
import re
from IPython.display import display
love_lyrics = ts_lyrics[ts_lyrics['Lyrics'].str.contains(r'\blove\b')]
display(love_lyrics)
  | Artist | Album | Title | Lyrics
---|---|---|---|---
1 | Taylor Swift | Taylor Swift | Picture to Burn | State the obvious, I didn't get my perfect fan... |
2 | Taylor Swift | Taylor Swift | Teardrops on my Guitar | Drew looks at me,\nI fake a smile so he won't ... |
6 | Taylor Swift | Taylor Swift | Tied Together With A Smile | Seems the only one who doesn't see your beauty... |
7 | Taylor Swift | Taylor Swift | Stay Beautiful | Cory's eyes are like a jungle\nHe smiles; it's... |
9 | Taylor Swift | Taylor Swift | Mary’s Song | She said\n"I was seven, and you were nine\nI l... |
... | ... | ... | ... | ... |
122 | Taylor Swift | folklore | seven | Please picture me\nIn the trees\nI hit my peak... |
123 | Taylor Swift | folklore | august | Salt air\nAnd the rust on your door\nI never n... |
129 | Taylor Swift | folklore | betty | Betty, I won't make assumptions about why you ... |
130 | Taylor Swift | folklore | peace | Our coming of age has come and gone\nSuddenly ... |
131 | Taylor Swift | folklore | hoax | My only one\nMy smoking gun\nMy eclipsed sun\n... |
62 rows × 4 columns
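Note that the pattern above is case-sensitive, so a line like "Love story" would be missed; str.contains accepts case=False (or flags=re.IGNORECASE). A toy sketch, not the lyrics data:

```python
import pandas as pd

toy = pd.Series(["Love story", "hate", "my love"])

strict = toy.str.contains(r'\blove\b').tolist()
loose = toy.str.contains(r'\blove\b', case=False).tolist()
print(strict)  # [False, False, True]
print(loose)   # [True, False, True]
```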
11. Use the str.count function and write a regex that counts how many times the word "love" appears in each lyric.
ts_lyrics['love_count'] = ts_lyrics['Lyrics'].str.count(r'\blove\b', flags=re.IGNORECASE)
print(ts_lyrics[['Lyrics', 'love_count']])
                                                Lyrics  love_count
0    He said the way my blue eyes shinx\nPut those ...           0
1    State the obvious, I didn't get my perfect fan...           2
2    Drew looks at me,\nI fake a smile so he won't ...           2
3    I don't know what I want, so don't ask me\n'Ca...           0
4    You have a way of coming easily to me\nAnd whe...           0
..                                                 ...         ...
127  What did you think I'd say to that?\nDoes a sc...           0
128  Keep your helmet\nKeep your life, son\nJust a ...           0
129  Betty, I won't make assumptions about why you ...           1
130  Our coming of age has come and gone\nSuddenly ...           2
131  My only one\nMy smoking gun\nMy eclipsed sun\n...           2

[132 rows x 2 columns]
12. Write a regex that extracts all words that are exactly 4 characters long in each lyric.
ts_lyrics['four_letter_words'] = ts_lyrics['Lyrics'].str.findall(r'\b\w{4}\b')
print(ts_lyrics[['Lyrics', 'four_letter_words']])
                                                Lyrics  \
0    He said the way my blue eyes shinx\nPut those ...
1    State the obvious, I didn't get my perfect fan...
2    Drew looks at me,\nI fake a smile so he won't ...
3    I don't know what I want, so don't ask me\n'Ca...
4    You have a way of coming easily to me\nAnd whe...
..                                                 ...
127  What did you think I'd say to that?\nDoes a sc...
128  Keep your helmet\nKeep your life, son\nJust a ...
129  Betty, I won't make assumptions about why you ...
130  Our coming of age has come and gone\nSuddenly ...
131  My only one\nMy smoking gun\nMy eclipsed sun\n...

                                     four_letter_words
0    [said, blue, eyes, that, said, That, Just, Tha...
1    [didn, love, more, than, ever, love, tell, you...
2    [Drew, fake, What, want, what, need, that, Tha...
3    [know, what, want, know, what, down, this, roa...
4    [have, when, take, take, very, best, need, fee...
..                                                 ...
127  [What, that, Does, when, back, They, kill, kno...
128  [Keep, your, Keep, your, life, Just, Here, you...
129  [make, your, time, when, your, like, from, Ine...
130  [come, gone, this, long, near, just, give, fir...
131  [only, This, down, This, Give, Your, love, onl...

[132 rows x 2 columns]
13. Write a regex that finds rows where the lyrics contain any numeric characters.
lyrics_with_numbers = ts_lyrics[ts_lyrics['Lyrics'].str.contains(r'\d')]
print(lyrics_with_numbers['Lyrics'])
44     I still remember the look on your face\nLit th...
63     I said, "Oh my, what a marvelous tune"\nIt was...
66     You said it in a simple way\n4AM, the second d...
74     It's 2 A.M. in your car\nWindows down, I pass ...
84     I wanna be your endgame\nI wanna be your first...
89     See you in the dark\nAll eyes on you, my magic...
103    I think he knows his footprints\nOn the sidewa...
111    You are somebody that I don't know\nBut you're...
126    Green was the color of the grass where I used ...
Name: Lyrics, dtype: object
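If you also want to extract the digits themselves rather than just flag the rows, re.findall with \d+ does it; an illustrative sketch on one of the matching lyric snippets:

```python
import re

line = "It's 2 A.M. in your car\nWindows down, I pass my old apartment"

# \d+ matches one or more consecutive digits
numbers = re.findall(r'\d+', line)
print(numbers)  # ['2']
```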
Computer review dataset¶
The Computer Review Dataset is an annotated dataset for aspect-based sentiment analysis. The data originates from https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html, while you can download a version of it from the course website (https://textminingcourse.nl/labs/week_1/data.zip).
14. Use the readlines function to read data from the computer.txt file. Convert the data to a dataframe and name it computer_531.
# Read the file line by line into a list
with open("data/computer.txt", "r", encoding="utf-8") as file:
    computer_data = file.readlines()

# Remove trailing newline characters
computer_data = [line.strip() for line in computer_data]

# Convert to a DataFrame
computer_531 = pd.DataFrame(computer_data, columns=["text"])
print(computer_531)
                                                  text
0    ## I purchased this monitor because of budgeta...
1    inexpensive[+1][a] ## This item was the most i...
2    monitor[-1] ## My overall experience with this...
3    screen[-1], picture quality[-1] ## When the sc...
4    monitor[-1], picture quality[-1] ## I 've view...
..                                                 ...
526           ## After that , it worked like a champ .
527                        ## No problems whatsoever .
528  incompatibility[-1] ## My only grips are the i...
529  ## This is a well know problem with the PCs an...
530  ## Also , the only hard button controls you ge...

[531 rows x 1 columns]
15. In this dataset, each line represents a review along with its annotated aspects and sentiments. For example, line 3 is "screen[-1], picture quality[-1] ## review text". Examining this line shows that the annotator thinks this review has two aspects/features, screen and picture quality, both associated with a sentiment score of negative one. The annotation is followed by the characters ## and then the actual review text. What we want to do now is write regular expressions to create a bit of structure for our data: 1) extract all the aspects and put them into a column, 2) put the review text in a column, and 3) sum the sentiment scores and, in another column, give the whole review a positive, negative, or neutral sentiment label based on the sign of the summed value. Write the regular expressions step by step with decent code documentation.
# Extract aspect/sentiment pairs, e.g. "screen[-1]" -> ("screen", "-1")
def extract_aspects(line):
    return re.findall(r"(\w+)\[([-+]?\d+)\]", line)

computer_531['aspects_and_sentiments'] = computer_531['text'].apply(extract_aspects)

# Extract the review text
def extract_review_text(line):
    # Match everything after '##'
    match = re.search(r'##\s*(.*)', line)
    return match.group(1) if match else None

computer_531['review_text'] = computer_531['text'].apply(extract_review_text)

# Keep only the aspect names
def get_aspects(aspects_and_sentiments):
    return [aspect for aspect, sentiment in aspects_and_sentiments]

computer_531['aspects'] = computer_531['aspects_and_sentiments'].apply(get_aspects)

# Sum the sentiment scores per review
def sum_sentiments(aspects_and_sentiments):
    return sum(int(sentiment) for _, sentiment in aspects_and_sentiments)

computer_531['summed_sentiment'] = computer_531['aspects_and_sentiments'].apply(sum_sentiments)

# Assign a label based on the sign of the summed score
def assign_sentiment_label(summed_sentiment):
    if summed_sentiment > 0:
        return 'Positive'
    elif summed_sentiment < 0:
        return 'Negative'
    else:
        return 'Neutral'

computer_531['sentiment_label'] = computer_531['summed_sentiment'].apply(assign_sentiment_label)
# Display the final dataframe
display(computer_531)
  | text | aspects_and_sentiments | review_text | aspects | summed_sentiment | sentiment_label
---|---|---|---|---|---|---
0 | ## I purchased this monitor because of budgeta... | [] | I purchased this monitor because of budgetary ... | [] | 0 | Neutral |
1 | inexpensive[+1][a] ## This item was the most i... | [(inexpensive, +1)] | This item was the most inexpensive 17 inch mon... | [inexpensive] | 1 | Positive |
2 | monitor[-1] ## My overall experience with this... | [(monitor, -1)] | My overall experience with this monitor was ve... | [monitor] | -1 | Negative |
3 | screen[-1], picture quality[-1] ## When the sc... | [(screen, -1), (quality, -1)] | When the screen was n't contracting or glitchi... | [screen, quality] | -2 | Negative |
4 | monitor[-1], picture quality[-1] ## I 've view... | [(monitor, -1), (quality, -1)] | I 've viewed numerous different monitor models... | [monitor, quality] | -2 | Negative |
... | ... | ... | ... | ... | ... | ... |
526 | ## After that , it worked like a champ . | [] | After that , it worked like a champ . | [] | 0 | Neutral |
527 | ## No problems whatsoever . | [] | No problems whatsoever . | [] | 0 | Neutral |
528 | incompatibility[-1] ## My only grips are the i... | [(incompatibility, -1)] | My only grips are the incompatibility with XP ... | [incompatibility] | -1 | Negative |
529 | ## This is a well know problem with the PCs an... | [] | This is a well know problem with the PCs and A... | [] | 0 | Neutral |
530 | ## Also , the only hard button controls you ge... | [] | Also , the only hard button controls you get a... | [] | 0 | Neutral |
531 rows × 6 columns
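Notice in the output that "picture quality" was captured as just "quality": the (\w+) group cannot span a space. A hedged variant of the pattern that also allows multi-word aspect names:

```python
import re

line = "screen[-1], picture quality[-1] ## When the screen was n't contracting ..."

# Allow aspect names made of several space-separated words before [...]
pattern = r"(\w+(?: \w+)*)\[([-+]?\d+)\]"
matches = re.findall(pattern, line)
print(matches)  # [('screen', '-1'), ('picture quality', '-1')]
```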
16. Save the final computer_531 dataframe to a CSV file. We will be using it in later parts of the course.
# Save the dataframe to a CSV file
computer_531.to_csv("data/computer_531_final.csv", index=False)
print("CSV has been saved.")
CSV has been saved.